library(tidyverse)
library(ggplot2)
library(ggpubr)
library(plotly)
library(data.table)
library(rstatix)
library(countrycode)
library(car)data <- data.table(read.csv("gapminder_clean.csv")) # used read.csv to leverage ggpubr plotting
data <- data %>% mutate(continent = countrycode(
sourcevar = Country.Name,
origin = "country.name",
destination = "continent"
))
data[is.na(continent), continent := "Unknown"]To determine the relationship between normalized CO2
emissions and GDP per capita in 1962 we’ll use Pearson’s correlation.
We’ll do this by setting cor.coef=TRUE in
ggscatter (default method is Pearson’s correlation).
data %>%
filter(Year == 1962) %>%
ggscatter(
x = grep("emissions", colnames(data), value = TRUE),
y = "gdpPercap",
add = "reg.line",
cor.coef = TRUE
) +
labs(
x = expression(paste(CO[2], " emissions (metric tons per capita)")),
y = "GDP per Capita",
title = expression(paste("Correlation between ", CO[2], " emissions and GDP per capita"))
)Interpretation: As you can see from the Pearson correlation coefficient (R) normalized CO2 emissions and GDP per capita in 1962 were highly correlated
To determine what year normalized CO2 emissions and GDP
per capita were the most highly correlated we’ll group by year, then
correlate using Pearson’s correlation (cor’s default
method)
maxCorrYear <- data %>%
group_by(Year) %>%
mutate(
correlation = cor(
CO2.emissions..metric.tons.per.capita.,
gdpPercap,
use = "complete.obs"
)
) %>%
ungroup() %>%
filter(correlation == max(correlation, na.rm = TRUE))The highest correlation between normalized CO2 emissions and GDP per capita is 1967
p <- maxCorrYear %>%
ggscatter(
x = grep("emissions", colnames(data), value = TRUE),
y = "gdpPercap",
size = "pop",
color = "continent"
) +
labs(
x = "CO<sub>2</sub> emissions (metric tons per capita)",
y = "GDP per Capita",
title = "Correlation between CO<sub>2</sub> emissions and GDP per capita",
size = NULL,
color = "Continent"
)
ggplotly(p)We will complete the following steps to determine the relationship between continent and normalized energy usage:
country_energy_use <- data %>%
group_by(Country.Name) %>%
reframe(
mean_energy_use = na.omit(
mean(
Energy.use..kg.of.oil.equivalent.per.capita.,
na.rm = TRUE
)
)
)
energy_use <- merge(
country_energy_use,
data[!duplicated(Country.Name), c("Country.Name", "continent"), with = FALSE]
)normality <- energy_use %>%
group_by(continent) %>%
summarise(
statistic = shapiro.test(mean_energy_use)$statistic,
p_value = shapiro.test(mean_energy_use)$p.value
)
normality_tbl <- normality %>%
ggtexttable(
rows = NULL,
theme = ttheme("blank")
) %>%
tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
tab_add_hline(at.row = nrow(normality) + 1, row.side = "bottom", linewidth = 2)
normality_tblInterpretation: The energy usage within each continent deviates significantly from a normal distribution (p-values < 0.05)
variance <- leveneTest(mean_energy_use ~ continent, data = energy_use)Levene’s statistic: 1.6483968, Levene’s p-value: 0.1485942
Interpretation: The p-value is > 0.05 meaning the variances in energy usage across each continent are not significantly different
Since our dataset is non-normal but has equal variance we should deploy a non-parametric test that does not rely on the assumption of normality, like the Wilcoxon test. The Wilcoxon test will allow us to perform a pairwise comparison between the energy usage of each continent.
pairwise_result <- pairwise_wilcox_test(
data = energy_use,
formula = mean_energy_use ~ continent,
p.adjust.method = "none"
) %>%
rename("p.signif" = "p.adj.signif") %>%
select(group1, group2, p.signif)
tbl <- pairwise_result %>%
ggtexttable(
rows = NULL,
theme = ttheme("blank")
) %>%
tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
tab_add_hline(at.row = nrow(pairwise_result) + 1, row.side = "bottom", linewidth = 2)Order the continents by median energy use
x_axis_order <- energy_use %>%
group_by(continent) %>%
summarize(
median = median(mean_energy_use, na.rm = TRUE)
) %>%
arrange(median) %>%
pull(continent)Generate plot
p <- energy_use %>%
ggboxplot(
x = "continent",
y = "mean_energy_use",
color = "continent",
add = "jitter",
order = x_axis_order,
outlier.shape = NA,
) +
labs(
title = "Energy Use (kg oil equivalent per capita) by Continent",
x = NULL,
y = "Mean energy use (kg of oil equivalent per capita)"
)
p <- ggpar(p, legend = "none")
ggarrange(p, tbl, ncol = 2, nrow = 1)
Interpretation: The figure above indicates that Europe has the highest
per capita energy usage with Africa and Oceania tied for the lowest per
capita energy usage. The energy usage of the Americas, Asia, and
entities not associated with a continent fall somewhere in between.
We will complete the following steps to determine if there a significant difference between Europe and Asia with respect to imports in the years after 1990:
country_percent_imports <- data %>%
filter(continent %in% c("Europe", "Asia") & Year >= 1990) %>%
group_by(Country.Name) %>%
reframe(
mean_import_perc = na.omit(
mean(
Imports.of.goods.and.services....of.GDP.,
na.rm = TRUE
)
)
)
percent_imports <- merge(
country_percent_imports,
data[!duplicated(Country.Name), c("Country.Name", "continent"), with = FALSE]
)normality <- percent_imports %>%
group_by(continent) %>%
summarise(
statistic = shapiro.test(mean_import_perc)$statistic,
p_value = shapiro.test(mean_import_perc)$p.value
)
normality_tbl <- normality %>%
ggtexttable(
rows = NULL,
theme = ttheme("blank")
) %>%
tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
tab_add_hline(at.row = nrow(normality) + 1, row.side = "bottom", linewidth = 2)
normality_tblInterpretation: Import percentage for each continent deviates significantly from a normal distribution (p-values < 0.05)
variance <- leveneTest(mean_import_perc ~ continent, data = percent_imports)Levene’s statistic: 1.1410734, Levene’s p-value: 0.2883807
Interpretation: The p-value is > 0.05 meaning the variances in import percentage across each continent are not significantly different
Since our dataset is non-normal but has equal variance we should
deploy a non-parametric test that does not rely on the assumption of
normality, like the Wilcoxon test. stat_compare_means will
automatically perform this calculation and plot the result on our
boxplot.
percent_imports %>%
ggboxplot(
x = "continent",
y = "mean_import_perc",
add = "jitter",
xlab = FALSE,
ylab = "Mean import percentage of goods and services",
title = "Import percentage of goods and services between Asia and Europe post 1990",
outlier.shape = NA
) +
stat_compare_means()As you can see from the Wilcoxon p-value (> 0.05), there is no significant difference between Europe and Asia with respect to the import percentage in the years after 1990
country_highest_pop <- data %>%
group_by(Country.Name) %>%
summarize(
mean = mean(Population.density..people.per.sq..km.of.land.area., na.rm = TRUE)
) %>%
filter(mean == max(mean, na.rm = TRUE)) %>%
pull(Country.Name)The countries with the highest population density per sq km of land are Macao SAR and China
country_highest_life <- data %>%
filter(Year >= 1962 & Year <= 2007) %>%
select(Country.Name, Year, Life.expectancy.at.birth..total..years.) %>%
spread(Year, Life.expectancy.at.birth..total..years.) %>%
mutate(life_expectancy_increase = `2007` - `1962`) %>%
filter(life_expectancy_increase == max(life_expectancy_increase, na.rm = TRUE)) %>%
pull(Country.Name)The country with the greatest increase in life expectancy at birth between 1962 and 2007 is the Maldives